# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
from project_lib import Project
project = Project(project_id='...', project_access_token='...')


# Define required imports
import pandas as pd
import numpy as np
from ast import literal_eval
from collections import defaultdict

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.cluster import homogeneity_score, completeness_score, v_measure_score


# Define get data file function
def get_file_handle(fname):
    # Project data path for the raw data file
    data_path = project.get_file(fname)
    data_path.seek(0)
    return data_path


# Define filenames
DATA_PATHS = ['themes.csv', 'groups_of_themes.csv']

# Use pandas to read the data 
df, groups_of_themes = [pd.read_csv(get_file_handle(data_path)) for data_path in DATA_PATHS]


# Check loaded dataframe
df.head()


# Check loaded dataframe
groups_of_themes.head()


# Convert groups_of_themes to list of lists
groups_of_themes['group'] = groups_of_themes['group'].apply(lambda x: literal_eval(x))
list_of_groups = groups_of_themes.group.values.tolist()


def get_top_n_terms_per_cluster(km_model, terms, n=5):
    """
    Gets the top terms used to cluster text
    
    :param km_model: KMeans model
    :param terms: list of terms from a TfidfVecotrizer object
    :return: dictionary mapping cluster number to top n terms
             {cluster_number: [term1, term2,..., termn]}
    """
    cluster_terms = defaultdict(list)
    
    order_centroids = km_model.cluster_centers_.argsort()[:, ::-1]
    for i in range(len(order_centroids)):
        cluster = order_centroids[i]
        for term_idx in cluster[:n]:
            cluster_terms[i].append(terms[term_idx])
            
    return cluster_terms


def run_kmeans(number_of_clusters, tfidf_matrix):
    """
    :param number_of_clusters: int
    :param tfidf_matrix: matrix from TfidfVectorizer object
    :return: KMeans model, list of cluster labels
    """
    km_model = KMeans(n_clusters=number_of_clusters, init='k-means++')
    km_model.fit(tfidf_matrix.toarray())
    clusters = km_model.labels_.tolist()
    return km_model, clusters


def run_model(X, number_of_clusters=None, number_of_terms=5, max_number_of_groups=5):
    """
    Runs the entire modeling process
    1. create TFIDF matrix
    2. run KMeans with TFIDF matrix
    3. get top terms used
    
    :return: (list of cluster assignments, 
              dictionary mapping cluster number of terms)
    """
    # First find TFIDF matrix
    tfidf_vectorizer = TfidfVectorizer(max_df=0.75 if len(X)>1 else 1, 
                                       min_df=0.1 if len(X)>1 else 1,
                                       stop_words='english',
                                       use_idf=True, 
                                       ngram_range=(1,3),
                                      )

    tfidf_matrix = tfidf_vectorizer.fit_transform(X)
    terms = tfidf_vectorizer.get_feature_names()
    
    # Number of clusters must be > 2 for silhouette_score to work.
    # If there are 2 or less comments, then just set to number of comments.
    number_of_clusters = len(X) if len(X) <= 2 else number_of_clusters
    
    if number_of_clusters:
        # If there's a specific number of clusters specified, then run with that number.
        km_model, best_clusters = run_kmeans(number_of_clusters, tfidf_matrix)
        cluster_terms = get_top_n_terms_per_cluster(km_model, terms, number_of_terms)
    else:
        # Automatically find number of clusters with silhouette_score
        # but have a maximum of max_number_of_groups 
        max_silhouette_score = 0
        for k in range(2, min(max_number_of_groups, len(X))):
            km_model, clusters = run_kmeans(k, tfidf_matrix)
            current_silhouette_score = silhouette_score(tfidf_matrix, clusters)
            if current_silhouette_score > max_silhouette_score:
                max_silhouette_score = current_silhouette_score
                cluster_terms = get_top_n_terms_per_cluster(km_model, terms, number_of_terms)
                best_clusters = clusters
            
    return best_clusters, cluster_terms


def run_model_testing(list_of_groups, df, test_size=50):
    average_homogeneity_score, average_completeness, average_v_measure = 0, 0, 0
    avg_baseline_score = 0

    for group in list_of_groups[:test_size]:
        X_test = list(df[df.label_id.isin(list(group))].Sentence.values)
        y_test = list(df[df.label_id.isin(list(group))].label_id.values)
        n = len(df[df.label_id.isin(list(group))].label_id.unique())
        
        clusters, cluster_terms = run_model(X_test)
        
        average_homogeneity_score += homogeneity_score(y_test, clusters)
        average_completeness += completeness_score(y_test, clusters)
        average_v_measure += v_measure_score(y_test, clusters)
        
        # baseline
        baseline_predictions = np.random.choice(np.arange(1, 5), len(X_test))
        avg_baseline_score += v_measure_score(y_test, baseline_predictions)

    
    print('Test V Measure:     ', average_v_measure/ test_size)
    print('Baseline V Measure: ', avg_baseline_score / test_size)


run_model_testing(list_of_groups, df)

/opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages/ipykernel/__main__.py:8: ConvergenceWarning: Number of distinct clusters (3) found smaller than n_clusters (4). Possibly due to duplicate points in X.

Test V Measure:      0.5073941061159324
Baseline V Measure:  0.16709925217020014


comments_5 = [
    'Customer service was polite.',
    'The socks are a pretty color but expensive.',
    'The shirt I bought was green and service was great.',
    'I think the sweater and socks were perfect.',
    'I do not like the shoes, so ugly and expensive.',
]

comments_20 = [
    'Out of all the products I bought, the shirt was my favorite because it is comfortable. However, the sweater and socks really missed the mark and were not worth it.',
    'My order arrived several days late. But when I contacted customer service they were very helpful and refunded me.',
    'Horrible customer service, I have never met such rude people. Would not recommend at all.',
    'Everything I ordered arrived perfectly on time and looked exactly like in the pictures! This company has high quality products.',
    'The company is okay.',
    'Prices are ridiculous',
    'Way too overpriced.',
    'Can never find anything that fits right',
    'Nice clothes for your teenager.',
    'They were really well organized and made the experience way less stressful than i thought it would be',
    'Friendly staff, good range of clothes.',
    'Fashionable place',
    'Decent quality product! Friendly customer service',
    'I really love all the clothes, beautiful. Just a little too expensive',
    'Great quality of the items, cashier and stocker were very friendly.',
    'Ugly and overpriced.'
    'This place was okay and I did find a couple plain shirts for cheap. Overall disappointed with their selection of basics and prices.',
    'Rude employees. Horrible customer service and limited clothing. Only good thing is cheap clothing',
    'Service was good and I got a lot for my money',
    'My favorite brand',
]


def print_clustering_result(sentences, labels, top_terms):
    label_to_sentences = defaultdict(list)
    for i in range(len(labels)):
        label_to_sentences[labels[i]].append(sentences[i])
    
    number_of_groups = len(label_to_sentences.keys())
    for i in range(number_of_groups):
        print('---------------------------------------')
        print('Group {}. Top Terms: {}'.format(i, top_terms[i]))
        for j in range(len(label_to_sentences[i])):
            print('- ' + label_to_sentences[i][j])


# Example 1
# Automatically determine number of clusters.
best_labels, top_terms = run_model(comments_5)
print_clustering_result(comments_5, best_labels, top_terms)

---------------------------------------
Group 0. Top Terms: ['expensive', 'socks', 'pretty color expensive', 'color', 'color expensive']
- The socks are a pretty color but expensive.
- I think the sweater and socks were perfect.
- I do not like the shoes, so ugly and expensive.
---------------------------------------
Group 1. Top Terms: ['service', 'polite', 'service polite', 'customer', 'customer service']
- Customer service was polite.
- The shirt I bought was green and service was great.


# Example 2
# Can also add the parameter number_of_clusters when running model
best_labels, top_terms = run_model(comments_20, number_of_clusters=3)
print_clustering_result(comments_20, best_labels, top_terms)

---------------------------------------
Group 0. Top Terms: ['favorite', 'way', 'prices', 'place', 'company']
- Out of all the products I bought, the shirt was my favorite because it is comfortable. However, the sweater and socks really missed the mark and were not worth it.
- Everything I ordered arrived perfectly on time and looked exactly like in the pictures! This company has high quality products.
- The company is okay.
- Prices are ridiculous
- Way too overpriced.
- Can never find anything that fits right
- They were really well organized and made the experience way less stressful than i thought it would be
- Fashionable place
- Ugly and overpriced.This place was okay and I did find a couple plain shirts for cheap. Overall disappointed with their selection of basics and prices.
- My favorite brand
---------------------------------------
Group 1. Top Terms: ['service', 'customer', 'customer service', 'quality', 'friendly']
- My order arrived several days late. But when I contacted customer service they were very helpful and refunded me.
- Horrible customer service, I have never met such rude people. Would not recommend at all.
- Decent quality product! Friendly customer service
- Great quality of the items, cashier and stocker were very friendly.
- Rude employees. Horrible customer service and limited clothing. Only good thing is cheap clothing
- Service was good and I got a lot for my money
---------------------------------------
Group 2. Top Terms: ['clothes', 'really', 'good', 'friendly', 'arrived']
- Nice clothes for your teenager.
- Friendly staff, good range of clothes.
- I really love all the clothes, beautiful. Just a little too expensive

	Article Title	Sentence	SectionTitle	Article Link	label	label_id
0	Moeller High School	Moeller's student-run newspaper, The Crusader,...	School publications	https://en.wikipedia.org/wiki/Moeller_High_School	Moeller_High_School:School_publications	3414
1	Moeller High School	In 2008, The Crusader won First Place, the sec...	School publications	https://en.wikipedia.org/wiki/Moeller_High_School	Moeller_High_School:School_publications	3414
2	Moeller High School	The Squire is a student literary journal that ...	School publications	https://en.wikipedia.org/wiki/Moeller_High_School	Moeller_High_School:School_publications	3414
3	Moeller High School	Paul Keels - play-by-play announcer for Ohio S...	Notable alumni	https://en.wikipedia.org/wiki/Moeller_High_School	Moeller_High_School:Notable_alumni	3413
4	Moeller High School	Joe Uecker - Ohio State Senator (R-66) .	Notable alumni	https://en.wikipedia.org/wiki/Moeller_High_School	Moeller_High_School:Notable_alumni	3413

	group
0	[2822, 1492, 2014, 4508, 4393]
1	[535, 2896, 3550, 1670, 2837]
2	[739, 659, 1015, 1362, 3938]
3	[4167, 4753, 1516, 1386, 1705]
4	[3029, 3826, 3057, 3969, 5299]

Modeling Using IBM Debater® Thematic Clustering of Sentences¶

Table of Contents¶

0. Prerequisites¶

Insert a project token¶

Import required modules¶

1. Load Data ¶

2. Modeling ¶

3. Testing ¶

4. Example ¶

Example 1¶

Example 2¶

Authors ¶